Libraries required for this analysis

# Global knitr option: center all figures produced by code chunks.
knitr::opts_chunk$set(fig.align="center") 
library(rstanarm)
library(tidyverse)
library(tidybayes)
library(modelr) 
library(ggplot2)
library(magrittr)  
library(emmeans)
library(bayesplot)
library(brms)
library(gganimate)

# Consistent light ggplot2 theme for every plot in this document.
theme_set(theme_light())


# Project-local helpers used throughout (e.g. user_response_posterior_draws_plot,
# user_response_expected_diff_in_mean_plot).
source('helper_functions.R')

In our experiment, we used a visualization recommendation algorithm (composed of one search algorithm and one oracle algorithm) to generate visualizations for the user on one of two datasets. We then asked the user to evaluate the tool on a variety of metrics (confidence in understanding data, confidence in answer, efficiency, ease of use, utility, and overall).

Given a search algorithm (bfs or dfs), an oracle (compassql or dziban), and a dataset (birdstrikes or movies), we would like to predict a user’s average score for a given metric. In addition, we would like to know if the choice of search algorithm and oracle has any meaningful impact on a user’s rating for these metrics.

Read in and clean data

# Likert-scale response columns analyzed in this document.
# (A duplicate re-definition of `analyses` after read.csv was removed.)
analyses <- c("confidence.udata", "confidence.ans", "efficiency", "ease.of.use", "utility", "overall")
confidence_metrics <- c("confidence.udata", "confidence.ans")
preference_metrics <- c("efficiency", "ease.of.use", "utility", "overall")

# Read per-task user responses and coerce columns to the types the ordinal
# models expect: ordered factors for the rating scales, plain factors for
# the experimental conditions.
user_response_data <- read.csv('split_by_participant_groups/ptask_responses.csv')
user_response_data[, analyses] <- lapply(user_response_data[, analyses], ordered)
user_response_data <- user_response_data %>%
  mutate(
    dataset = as.factor(dataset),
    oracle = as.factor(oracle),
    search = as.factor(search),
    task = as.factor(task)
  )

# Containers for fitted models and condition-difference plots, filled in
# section by section below.
models <- list()

search_differences <- list()
oracle_differences <- list()
alg_differences <- list()
participant_group_differences <- list()

# Shared RNG seed so model fitting and posterior draws are reproducible.
seed <- 12

Analysis for user responses

Confidence in Understanding Data: Building a Model

# Bayesian ordinal (cumulative probit) regression for the "confidence in
# understanding data" rating.  Fixed effects: dataset, task, and the
# oracle x search interaction; per-participant random intercept accounts
# for repeated measures.
models$confidence_udata <- brm(
    formula = bf(confidence.udata ~ dataset + oracle * search + task + (1 | participant_id)),
    family = cumulative("probit"),
    # Weakly informative prior on the latent-scale thresholds.
    prior = prior(normal(0.26, 1.26), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    # High adapt_delta reduces divergent transitions in this hierarchical model.
    control = list(adapt_delta = 0.99),
    # Cache the fit to disk; subsequent runs load it instead of resampling.
    file = "models/confidence_udata",
    seed = seed
  )

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

summary(models$confidence_udata)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: confidence.udata ~ dataset + oracle * search + task + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 264) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 66) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.04      0.16     0.75     1.38 1.00      972     1624
## 
## Population-Level Effects: 
##                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]              -2.01      0.39    -2.80    -1.26 1.00     1608
## Intercept[2]              -0.84      0.37    -1.61    -0.13 1.00     1563
## Intercept[3]               1.33      0.37     0.59     2.06 1.00     1565
## datasetmovies              0.10      0.30    -0.48     0.70 1.00     1359
## oracledziban               0.06      0.44    -0.80     0.89 1.00     1204
## searchdfs                 -0.42      0.43    -1.27     0.40 1.00     1272
## task2.RetrieveValue        0.29      0.21    -0.11     0.70 1.00     3249
## task3.Prediction           0.16      0.21    -0.23     0.56 1.00     3486
## task4.Exploration          0.61      0.21     0.19     1.01 1.00     3318
## oracledziban:searchdfs     0.70      0.60    -0.47     1.89 1.00     1232
##                        Tail_ESS
## Intercept[1]               2093
## Intercept[2]               2149
## Intercept[3]               2240
## datasetmovies              1429
## oracledziban               1403
## searchdfs                  1577
## task2.RetrieveValue        2601
## task3.Prediction           2538
## task4.Exploration          2197
## oracledziban:searchdfs     1304
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

plot(models$confidence_udata)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Pairwise posterior plots for the threshold (intercept) parameters.
# The confidence.udata outcome has four observed levels, so this model has
# only three thresholds (b_Intercept[1..3], see summary above); the
# original call also requested a nonexistent b_Intercept[4], which makes
# pairs() fail with an unknown-parameter error.
pairs(
  models$confidence_udata,
  pars = c("b_Intercept[1]",
           "b_Intercept[2]",
           "b_Intercept[3]"),
  fixed = TRUE
)

# Pairwise posterior plots for the population-level effects of the
# confidence.udata model; watch for strongly correlated parameters.
confidence_udata_effect_pars <- c(
  "b_datasetmovies",
  "b_oracledziban",
  "b_searchdfs",
  "b_task2.RetrieveValue",
  "b_task3.Prediction",
  "b_task4.Exploration"
)
pairs(models$confidence_udata, pars = confidence_udata_effect_pars, fixed = TRUE)

We now look at an average response for confidence in understanding the data using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.

confidence_udata_plot <- user_response_posterior_draws_plot(user_response_data, models$confidence_udata, NULL, "Oracle/Search Combination", "Rating")
confidence_udata_plot$plot

We can get the numeric values of the interval boundaries shown above with mean_qi

confidence_udata_plot$intervals
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 bfs    compassql  1.01   0.688  1.33    0.95 mean   qi       
## 2 bfs    dziban     1.05   0.721  1.34    0.95 mean   qi       
## 3 dfs    compassql  0.804  0.426  1.13    0.95 mean   qi       
## 4 dfs    dziban     1.17   0.859  1.48    0.95 mean   qi       
## 5 bfs    compassql  1.01   0.906  1.12    0.5  mean   qi       
## 6 bfs    dziban     1.05   0.941  1.15    0.5  mean   qi       
## 7 dfs    compassql  0.804  0.691  0.926   0.5  mean   qi       
## 8 dfs    dziban     1.17   1.06   1.28    0.5  mean   qi
## Saving 7 x 5 in image

Confidence in Understanding Data: Differences Between Conditions

Next, we want to see if there is any meaningful difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

# Posterior predictive draws (population level only: re_formula = NA),
# with a combined search+oracle condition label for pairwise comparisons.
confidence_udata_predictive_data <- user_response_data %>%
  add_predicted_draws(models$confidence_udata, seed = seed, re_formula = NA) %>%
  mutate(alg = paste(search, oracle))

Differences in user score by search algorithm.

search_differences$confidence_udata <- user_response_expected_diff_in_mean_plot(confidence_udata_predictive_data, "search", "confidence.udata", "Difference in Average Confidence in Understanding Data (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$confidence_udata$plot

Differences in user score by oracle.

oracle_differences$confidence_udata <- user_response_expected_diff_in_mean_plot(confidence_udata_predictive_data, "oracle", "confidence.udata", "Difference in Average Confidence in Understanding Data (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$confidence_udata$plot

Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)

confidence_udata_predictive_data_subset <- subset(confidence_udata_predictive_data, alg %in% c("dfs compassql", "bfs dziban"))

alg_differences$confidence_udata <- user_response_expected_diff_in_mean_plot(confidence_udata_predictive_data_subset, "alg", "confidence.udata", "Difference in Average Confidence in Understanding Data (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$confidence_udata$plot

Differences in user score by participant group

participant_group_differences$confidence_udata <- user_response_expected_diff_in_mean_plot(confidence_udata_predictive_data, "participant_group", "confidence.udata", "Difference in Average Confidence in Understanding Data (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'participant_group' (override with `.groups` argument)
participant_group_differences$confidence_udata$plot

Confidence in Answer: Building a Model

# Cumulative probit model for the "confidence in answer" rating; same
# structure as the confidence.udata model (dataset + oracle * search + task
# fixed effects, per-participant random intercept).
models$confidence_ans <- brm(
    formula = bf(confidence.ans ~ dataset + oracle * search + task + (1 | participant_id)),
    family = cumulative("probit"),
    # Weakly informative prior on the latent-scale thresholds.
    prior = prior(normal(0.26, 1.26), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    # High adapt_delta reduces divergent transitions.
    control = list(adapt_delta = 0.99),
    # Cache the fit to disk; subsequent runs load it instead of resampling.
    file = "models/confidence_ans",
    seed = seed
  )

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

summary(models$confidence_ans)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: confidence.ans ~ dataset + oracle * search + task + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 264) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 66) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.55      0.14     0.27     0.82 1.00      744      662
## 
## Population-Level Effects: 
##                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]              -3.24      0.38    -4.03    -2.49 1.00     2489
## Intercept[2]              -2.41      0.29    -2.99    -1.86 1.00     2552
## Intercept[3]              -1.57      0.27    -2.11    -1.06 1.00     2354
## Intercept[4]               0.17      0.25    -0.32     0.67 1.00     2519
## datasetmovies             -0.16      0.19    -0.55     0.23 1.00     2629
## oracledziban               0.24      0.28    -0.31     0.79 1.00     2084
## searchdfs                  0.12      0.28    -0.41     0.66 1.00     2064
## task2.RetrieveValue       -0.30      0.21    -0.69     0.11 1.00     2839
## task3.Prediction          -1.04      0.20    -1.44    -0.64 1.00     2829
## task4.Exploration         -0.62      0.20    -1.01    -0.23 1.00     2993
## oracledziban:searchdfs    -0.02      0.39    -0.79     0.81 1.00     1992
##                        Tail_ESS
## Intercept[1]               2504
## Intercept[2]               2276
## Intercept[3]               2530
## Intercept[4]               2583
## datasetmovies              2500
## oracledziban               2087
## searchdfs                  2504
## task2.RetrieveValue        2363
## task3.Prediction           2476
## task4.Exploration          2197
## oracledziban:searchdfs     1885
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

plot(models$confidence_ans)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Threshold (intercept) correlations: confidence.ans has five observed
# levels, hence four thresholds b_Intercept[1..4].
pairs(
  models$confidence_ans,
  pars = c("b_Intercept[1]",
           "b_Intercept[2]",
           "b_Intercept[3]",
           "b_Intercept[4]"),
  fixed = TRUE
)

# Population-level effect correlations for the confidence.ans model.
pairs(
  models$confidence_ans,
  pars = c("b_datasetmovies",
           "b_oracledziban",
           "b_searchdfs",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at an average response for confidence in answer using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.

confidence_ans_plot <- user_response_posterior_draws_plot(user_response_data, models$confidence_ans, NULL, "Oracle/Search Combination", "Rating")
confidence_ans_plot$plot

We can get the numeric values of the interval boundaries shown above with mean_qi

confidence_ans_plot$intervals
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 bfs    compassql   1.01  0.688   1.30   0.95 mean   qi       
## 2 bfs    dziban      1.16  0.853   1.44   0.95 mean   qi       
## 3 dfs    compassql   1.08  0.765   1.37   0.95 mean   qi       
## 4 dfs    dziban      1.22  0.922   1.5    0.95 mean   qi       
## 5 bfs    compassql   1.01  0.906   1.12   0.5  mean   qi       
## 6 bfs    dziban      1.16  1.06    1.25   0.5  mean   qi       
## 7 dfs    compassql   1.08  0.985   1.19   0.5  mean   qi       
## 8 dfs    dziban      1.22  1.12    1.31   0.5  mean   qi
## Saving 7 x 5 in image

Confidence in Answer: Differences Between Conditions

Next, we want to see if there is any meaningful difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

# Posterior predictive draws (population level only: re_formula = NA),
# with a combined search+oracle condition label for pairwise comparisons.
confidence_ans_predictive_data <- user_response_data %>%
  add_predicted_draws(models$confidence_ans, seed = seed, re_formula = NA) %>%
  mutate(alg = paste(search, oracle))

Differences in user score by search algorithm.

search_differences$confidence_ans <- user_response_expected_diff_in_mean_plot(confidence_ans_predictive_data, "search", "confidence.ans", "Difference in Average Confidence in Answer (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$confidence_ans$plot

Differences in user score by oracle.

oracle_differences$confidence_ans <- user_response_expected_diff_in_mean_plot(confidence_ans_predictive_data, "oracle", "confidence.ans", "Difference in Average Confidence in Answer (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$confidence_ans$plot

Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)

confidence_ans_predictive_data_subset <- subset(confidence_ans_predictive_data, alg %in% c("dfs compassql", "bfs dziban"))
alg_differences$confidence_ans <- user_response_expected_diff_in_mean_plot(confidence_ans_predictive_data_subset, "alg", "confidence.ans", "Difference in Average Confidence in Answer (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$confidence_ans$plot

Differences in user score by participant group

participant_group_differences$confidence_ans <- user_response_expected_diff_in_mean_plot(confidence_ans_predictive_data, "participant_group", "confidence.ans", "Difference in Average Confidence in Answer (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'participant_group' (override with `.groups` argument)
participant_group_differences$confidence_ans$plot

Efficiency: Building a Model

# Cumulative probit model for the "efficiency" rating; same structure as
# the confidence models above.  The stray, unused `filename = "efficiency"`
# assignment was removed (the cache path is passed directly via `file`),
# and the prior line's indentation now matches the sibling brm() calls.
models$efficiency <- brm(
    formula = bf(efficiency ~ dataset + oracle * search + task + (1 | participant_id)),
    family = cumulative("probit"),
    # Weakly informative prior on the latent-scale thresholds.
    prior = prior(normal(0.26, 1.26), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    control = list(adapt_delta = 0.99),
    file = "models/efficiency",
    seed = seed
  )

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

summary(models$efficiency)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: efficiency ~ dataset + oracle * search + task + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 264) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 66) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.16      0.16     0.88     1.50 1.00     1027     1369
## 
## Population-Level Effects: 
##                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]              -2.58      0.43    -3.49    -1.76 1.00     1163
## Intercept[2]              -1.01      0.39    -1.81    -0.26 1.00     1002
## Intercept[3]              -0.14      0.39    -0.91     0.64 1.00     1003
## Intercept[4]               1.07      0.40     0.29     1.86 1.00     1007
## datasetmovies              0.31      0.32    -0.31     0.95 1.00     1093
## oracledziban              -0.15      0.46    -1.03     0.74 1.00      969
## searchdfs                 -1.21      0.47    -2.17    -0.31 1.00     1021
## task2.RetrieveValue       -0.27      0.19    -0.64     0.12 1.00     3059
## task3.Prediction           0.27      0.19    -0.10     0.65 1.00     3024
## task4.Exploration          0.46      0.20     0.08     0.85 1.00     3251
## oracledziban:searchdfs     0.86      0.64    -0.43     2.15 1.00      978
##                        Tail_ESS
## Intercept[1]               1461
## Intercept[2]               1361
## Intercept[3]               1337
## Intercept[4]               1307
## datasetmovies              1658
## oracledziban               1427
## searchdfs                  1526
## task2.RetrieveValue        2384
## task3.Prediction           2162
## task4.Exploration          2232
## oracledziban:searchdfs     1427
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

plot(models$efficiency)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Threshold (intercept) correlations for the efficiency model: five
# observed levels, hence four thresholds b_Intercept[1..4].
pairs(
  models$efficiency,
  pars = c("b_Intercept[1]",
           "b_Intercept[2]",
           "b_Intercept[3]",
           "b_Intercept[4]"),
  fixed = TRUE
)

# Pairwise posterior plots for the population-level effects of the
# efficiency model; watch for strongly correlated parameters.
efficiency_effect_pars <- c(
  "b_datasetmovies",
  "b_oracledziban",
  "b_searchdfs",
  "b_task2.RetrieveValue",
  "b_task3.Prediction",
  "b_task4.Exploration"
)
pairs(models$efficiency, pars = efficiency_effect_pars, fixed = TRUE)

We now look at an average response for efficiency using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.

efficiency_plot <- user_response_posterior_draws_plot(user_response_data, models$efficiency, NULL, "Oracle/Search Combination", "Rating")
efficiency_plot$plot

We can get the numeric values of the interval boundaries shown above with mean_qi

efficiency_plot$intervals
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating  .lower  .upper .width .point .interval
##   <fct>  <fct>      <dbl>   <dbl>   <dbl>  <dbl> <chr>  <chr>    
## 1 bfs    compassql  0.745  0.172   1.27     0.95 mean   qi       
## 2 bfs    dziban     0.635  0.0882  1.16     0.95 mean   qi       
## 3 dfs    compassql -0.268 -0.809   0.309    0.95 mean   qi       
## 4 dfs    dziban     0.324 -0.281   0.906    0.95 mean   qi       
## 5 bfs    compassql  0.745  0.562   0.953    0.5  mean   qi       
## 6 bfs    dziban     0.635  0.441   0.824    0.5  mean   qi       
## 7 dfs    compassql -0.268 -0.456  -0.0735   0.5  mean   qi       
## 8 dfs    dziban     0.324  0.125   0.516    0.5  mean   qi
## Saving 7 x 5 in image

Efficiency: Differences Between Conditions

Next, we want to see if there is any meaningful difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

# Posterior predictive draws (population level only: re_formula = NA),
# with a combined search+oracle condition label for pairwise comparisons.
efficiency_predictive_data <- user_response_data %>%
  add_predicted_draws(models$efficiency, seed = seed, re_formula = NA) %>%
  mutate(alg = paste(search, oracle))

Differences in user score by search algorithm.

search_differences$efficiency <- user_response_expected_diff_in_mean_plot(efficiency_predictive_data, "search", "efficiency", "Difference in Average Efficiency (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$efficiency$plot

Differences in user score by oracle.

oracle_differences$efficiency <- user_response_expected_diff_in_mean_plot(efficiency_predictive_data, "oracle", "efficiency", "Difference in Average Efficiency (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$efficiency$plot

Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)

# Subset to the two crossed conditions and compare dfs+compassql against
# bfs+dziban.  (Renamed from `efficiency_predictive_data_data_subset`,
# which had a duplicated "data"; the name was only used on these lines.)
efficiency_predictive_data_subset <- subset(efficiency_predictive_data, alg %in% c("dfs compassql", "bfs dziban"))
alg_differences$efficiency <- user_response_expected_diff_in_mean_plot(efficiency_predictive_data_subset, "alg", "efficiency", "Difference in Average Efficiency (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$efficiency$plot

Differences in user score by participant group

participant_group_differences$efficiency <- user_response_expected_diff_in_mean_plot(efficiency_predictive_data, "participant_group", "efficiency", "Difference in Average Efficiency (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'participant_group' (override with `.groups` argument)
participant_group_differences$efficiency$plot

Ease of Use: Building a Model

# Cumulative probit model for the "ease of use" rating; same structure as
# the models above.  Prior-line indentation fixed to match the sibling
# brm() calls.
models$ease_of_use <- brm(
    formula = bf(ease.of.use ~ dataset + oracle * search + task + (1 | participant_id)),
    family = cumulative("probit"),
    # Weakly informative prior on the latent-scale thresholds.
    prior = prior(normal(0.26, 1.26), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    control = list(adapt_delta = 0.99),
    file = "models/ease_of_use",
    seed = seed
  )

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

summary(models$ease_of_use)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: ease.of.use ~ dataset + oracle * search + task + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 264) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 66) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.06      0.15     0.79     1.40 1.01      694     1152
## 
## Population-Level Effects: 
##                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]              -2.77      0.43    -3.69    -1.96 1.00     1182
## Intercept[2]              -1.36      0.38    -2.14    -0.61 1.00     1097
## Intercept[3]              -0.43      0.37    -1.15     0.33 1.00     1068
## Intercept[4]               1.44      0.39     0.70     2.22 1.00     1115
## datasetmovies              0.48      0.31    -0.12     1.07 1.00      913
## oracledziban              -0.37      0.44    -1.26     0.49 1.00      847
## searchdfs                 -1.26      0.43    -2.13    -0.43 1.00      900
## task2.RetrieveValue        0.20      0.19    -0.18     0.58 1.00     3202
## task3.Prediction           0.35      0.20    -0.04     0.75 1.00     2983
## task4.Exploration          0.43      0.20     0.04     0.84 1.00     3215
## oracledziban:searchdfs     0.86      0.61    -0.34     2.04 1.01      812
##                        Tail_ESS
## Intercept[1]               1507
## Intercept[2]               1607
## Intercept[3]               1548
## Intercept[4]               1877
## datasetmovies              1606
## oracledziban               1166
## searchdfs                  1163
## task2.RetrieveValue        2500
## task3.Prediction           2034
## task4.Exploration          2134
## oracledziban:searchdfs     1213
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

plot(models$ease_of_use)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Threshold (intercept) correlations for the ease-of-use model: five
# observed levels, hence four thresholds b_Intercept[1..4].
pairs(
  models$ease_of_use,
  pars = c("b_Intercept[1]",
           "b_Intercept[2]",
           "b_Intercept[3]",
           "b_Intercept[4]"),
  fixed = TRUE
)

# Population-level effect correlations for the ease-of-use model.
pairs(
  models$ease_of_use,
   pars = c("b_datasetmovies",
           "b_oracledziban",
           "b_searchdfs",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at a average response for ease of use using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.

ease_of_use_plot <- user_response_posterior_draws_plot(user_response_data, models$ease_of_use, NULL, "Oracle/Search Combination", "Rating")
ease_of_use_plot$plot

We can get the numeric values of the interval boundaries shown above with mean_qi

ease_of_use_plot$intervals
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating  .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 bfs    compassql 0.940   0.516   1.31    0.95 mean   qi       
## 2 bfs    dziban    0.722   0.279   1.10    0.95 mean   qi       
## 3 dfs    compassql 0.0744 -0.397   0.559   0.95 mean   qi       
## 4 dfs    dziban    0.432  -0.0781  0.875   0.95 mean   qi       
## 5 bfs    compassql 0.940   0.812   1.08    0.5  mean   qi       
## 6 bfs    dziban    0.722   0.588   0.868   0.5  mean   qi       
## 7 dfs    compassql 0.0744 -0.0882  0.25    0.5  mean   qi       
## 8 dfs    dziban    0.432   0.281   0.594   0.5  mean   qi
## Saving 7 x 5 in image

Ease of Use: Differences Between Conditions

Next, we want to see if there is any meaningful difference in user ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

# Posterior predictive draws (population level only: re_formula = NA),
# with a combined search+oracle condition label for pairwise comparisons.
ease_of_use_predictive_data <- user_response_data %>%
  add_predicted_draws(models$ease_of_use, seed = seed, re_formula = NA) %>%
  mutate(alg = paste(search, oracle))

Differences in user score by search algorithm.

search_differences$ease_of_use <- user_response_expected_diff_in_mean_plot(ease_of_use_predictive_data, "search", "ease.of.use", "Difference in Average Ease of Use (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$ease_of_use$plot

Differences in user score by oracle.

oracle_differences$ease_of_use <- user_response_expected_diff_in_mean_plot(ease_of_use_predictive_data, "oracle", "ease.of.use", "Difference in Average Ease of Use (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$ease_of_use$plot

Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)

ease_of_use_predictive_data_subset <- subset(ease_of_use_predictive_data, alg %in% c("dfs compassql", "bfs dziban"))

alg_differences$ease_of_use <- user_response_expected_diff_in_mean_plot(ease_of_use_predictive_data_subset, "alg", "ease.of.use", "Difference in Average Ease of Use (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$ease_of_use$plot

Differences in user score by participant group

participant_group_differences$ease_of_use <- user_response_expected_diff_in_mean_plot(ease_of_use_predictive_data, "participant_group", "ease.of.use", "Difference in Average Ease of Use (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'participant_group' (override with `.groups` argument)
participant_group_differences$ease_of_use$plot

Utility: Building a Model

# Cumulative probit model for the "utility" rating; same structure as the
# other rating models (dataset + oracle * search + task fixed effects,
# per-participant random intercept).  Prior-line indentation fixed to
# match the sibling brm() calls.
models$utility <- brm(
    formula = bf(utility ~ dataset + oracle * search + task + (1 | participant_id)),
    family = cumulative("probit"),
    # Weakly informative prior on the latent-scale thresholds.
    prior = prior(normal(0.26, 1.26), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    control = list(adapt_delta = 0.99),
    file = "models/utility",
    seed = seed
  )

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

summary(models$utility)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: utility ~ dataset + oracle * search + task + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 264) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 66) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.97      0.14     0.71     1.27 1.01      768     1528
## 
## Population-Level Effects: 
##                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]              -1.79      0.37    -2.52    -1.07 1.00     1347
## Intercept[2]              -0.54      0.35    -1.23     0.14 1.00     1408
## Intercept[3]               0.11      0.35    -0.58     0.80 1.00     1392
## Intercept[4]               1.41      0.36     0.71     2.13 1.00     1466
## datasetmovies              0.38      0.28    -0.15     0.93 1.00     1375
## oracledziban               0.06      0.39    -0.74     0.82 1.00     1186
## searchdfs                 -0.83      0.39    -1.58    -0.07 1.00     1228
## task2.RetrieveValue       -0.15      0.19    -0.53     0.21 1.00     3553
## task3.Prediction           0.34      0.19    -0.03     0.72 1.00     3373
## task4.Exploration          0.65      0.19     0.27     1.03 1.00     3269
## oracledziban:searchdfs     0.52      0.57    -0.55     1.65 1.00     1107
##                        Tail_ESS
## Intercept[1]               1805
## Intercept[2]               1980
## Intercept[3]               1957
## Intercept[4]               2029
## datasetmovies              1771
## oracledziban               1610
## searchdfs                  1708
## task2.RetrieveValue        1934
## task3.Prediction           2623
## task4.Exploration          2692
## oracledziban:searchdfs     1526
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

plot(models$utility)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Pairwise posterior scatterplots of the four ordinal cutpoints; strong
# correlations here would mean the model has trouble separating them.
pairs(
  models$utility,
  pars = c("b_Intercept[1]",
           "b_Intercept[2]",
           "b_Intercept[3]",
           "b_Intercept[4]"),
  fixed = TRUE
)

# Same check for the population-level (fixed) effects.
pairs(
  models$utility,
   pars = c("b_datasetmovies",
           "b_oracledziban",
           "b_searchdfs",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at an average response for Utility using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.

# Posterior-draw plot of average Utility per oracle/search combination
# (helper presumably defined in helper_functions.R; returns the plot and
# the numeric intervals shown below).
utility_plot <- user_response_posterior_draws_plot(user_response_data, models$utility, NULL, "Oracle/Search Combination", "Rating")
utility_plot$plot

We can get the numeric values of the interval boundaries shown above with mean_qi

utility_plot$intervals
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating  .lower  .upper .width .point .interval
##   <fct>  <fct>      <dbl>   <dbl>   <dbl>  <dbl> <chr>  <chr>    
## 1 bfs    compassql  0.558 -0.0156  1.08     0.95 mean   qi       
## 2 bfs    dziban     0.620  0.0588  1.12     0.95 mean   qi       
## 3 dfs    compassql -0.200 -0.721   0.353    0.95 mean   qi       
## 4 dfs    dziban     0.329 -0.25    0.859    0.95 mean   qi       
## 5 bfs    compassql  0.558  0.375   0.75     0.5  mean   qi       
## 6 bfs    dziban     0.620  0.441   0.809    0.5  mean   qi       
## 7 dfs    compassql -0.200 -0.397  -0.0147   0.5  mean   qi       
## 8 dfs    dziban     0.329  0.141   0.516    0.5  mean   qi
## Saving 7 x 5 in image

Utility: Differences Between Conditions

Next, we want to see if there is any meaningful difference in Utility ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

# Posterior predictive draws for Utility; re_formula = NA drops the
# participant random effect so predictions are for the average participant.
utility_predictive_data <- user_response_data %>% add_predicted_draws(models$utility, seed = seed, re_formula = NA) 
# Label each search/oracle combination (e.g. "bfs compassql") for subsetting.
utility_predictive_data$alg <- paste(utility_predictive_data$search, utility_predictive_data$oracle)

Differences in user score by search algorithm.

search_differences$utility <- user_response_expected_diff_in_mean_plot(utility_predictive_data, "search", "utility", "Difference in Average Utility (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$utility$plot

Differences in user score by oracle.

oracle_differences$utility <- user_response_expected_diff_in_mean_plot(utility_predictive_data, "oracle", "utility", "Difference in Average Utility (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$utility$plot

Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)

utility_predictive_data_subset <- subset(utility_predictive_data, alg %in% c("dfs compassql", "bfs dziban"))
alg_differences$utility <- user_response_expected_diff_in_mean_plot(utility_predictive_data_subset, "alg", "utility", "Difference in Average Utility (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$utility$plot

Differences in user score by participant group

participant_group_differences$utility <- user_response_expected_diff_in_mean_plot(utility_predictive_data, "participant_group", "utility", "Difference in Average Utility (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'participant_group' (override with `.groups` argument)
participant_group_differences$utility$plot

Overall: Building a Model

# Bayesian ordinal regression for the "overall" rating, mirroring the model
# used for the other metrics: cumulative probit with a per-participant
# random intercept.
models$overall <- brm(
    formula = bf(overall ~ dataset + oracle * search + task + (1 | participant_id)),
    family = cumulative("probit"),  # ordered 5-point response, probit link
   prior = prior(normal(0.26, 1.26), class = Intercept),  # prior on the ordinal cutpoints
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    control = list(adapt_delta = 0.99),  # stricter sampler target to avoid divergences
    file = "models/overall",  # brms caches the fit here; reruns load instead of refitting
    seed = seed
  )

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

summary(models$overall)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: overall ~ dataset + oracle * search + task + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 264) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 66) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.49      0.20     1.13     1.93 1.00      712     1065
## 
## Population-Level Effects: 
##                        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]              -2.94      0.52    -4.00    -1.94 1.00      667
## Intercept[2]              -1.58      0.49    -2.54    -0.64 1.00      620
## Intercept[3]              -0.31      0.48    -1.25     0.64 1.00      600
## Intercept[4]               1.84      0.50     0.88     2.83 1.00      608
## datasetmovies             -0.05      0.40    -0.86     0.72 1.00      396
## oracledziban               0.06      0.56    -1.04     1.19 1.00      462
## searchdfs                 -0.85      0.56    -1.92     0.24 1.00      467
## task2.RetrieveValue       -0.02      0.20    -0.41     0.38 1.00     2167
## task3.Prediction           0.43      0.20     0.03     0.82 1.00     2125
## task4.Exploration          0.66      0.21     0.24     1.07 1.00     2149
## oracledziban:searchdfs     0.56      0.78    -0.96     2.06 1.00      472
##                        Tail_ESS
## Intercept[1]               1177
## Intercept[2]               1123
## Intercept[3]               1082
## Intercept[4]               1052
## datasetmovies              1001
## oracledziban                780
## searchdfs                   992
## task2.RetrieveValue        2138
## task3.Prediction           2355
## task4.Exploration          1898
## oracledziban:searchdfs      884
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

plot(models$overall)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Pairwise posterior scatterplots of the four ordinal cutpoints; strong
# correlations here would mean the model has trouble separating them.
pairs(
  models$overall,
  pars = c("b_Intercept[1]",
           "b_Intercept[2]",
           "b_Intercept[3]",
           "b_Intercept[4]"),
  fixed = TRUE
)

# Same check for the population-level (fixed) effects.
pairs(
  models$overall,
   pars = c("b_datasetmovies",
           "b_oracledziban",
           "b_searchdfs",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at an average response for Overall using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.

# Posterior-draw plot of average Overall rating per oracle/search combination
# (helper presumably defined in helper_functions.R; returns the plot and
# the numeric intervals shown below).
overall_plot <- user_response_posterior_draws_plot(user_response_data, models$overall, NULL, "Oracle/Search Combination", "Rating")
overall_plot$plot

We can get the numeric values of the interval boundaries shown above with mean_qi

overall_plot$intervals
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating  .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 bfs    compassql  0.707  0.203   1.12    0.95 mean   qi       
## 2 bfs    dziban     0.742  0.279   1.13    0.95 mean   qi       
## 3 dfs    compassql  0.196 -0.353   0.721   0.95 mean   qi       
## 4 dfs    dziban     0.580  0.0469  1       0.95 mean   qi       
## 5 bfs    compassql  0.707  0.562   0.875   0.5  mean   qi       
## 6 bfs    dziban     0.742  0.618   0.882   0.5  mean   qi       
## 7 dfs    compassql  0.196  0       0.397   0.5  mean   qi       
## 8 dfs    dziban     0.580  0.422   0.75    0.5  mean   qi
## Saving 7 x 5 in image

Overall: Differences Between Conditions

Next, we want to see if there is any meaningful difference in Overall ratings between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

# Posterior predictive draws for Overall; re_formula = NA drops the
# participant random effect so predictions are for the average participant.
overall_predictive_data <- user_response_data %>% add_predicted_draws(models$overall, seed = seed, re_formula = NA) 
# Label each search/oracle combination (e.g. "bfs compassql") for subsetting.
overall_predictive_data$alg <- paste(overall_predictive_data$search, overall_predictive_data$oracle)

Differences in user score by search algorithm.

search_differences$overall <- user_response_expected_diff_in_mean_plot(overall_predictive_data, "search", "overall", "Difference in Average Overall (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'search' (override with `.groups` argument)
search_differences$overall$plot

Differences in user score by oracle.

# NOTE(review): this hand-rolled version of the oracle differences is
# overwritten a few lines below by the user_response_expected_diff_in_mean_plot()
# helper call, so everything in this chunk is effectively dead code —
# consider keeping only one of the two versions.
oracle_differences$overall <- overall_predictive_data %>% 
  group_by(oracle, .draw) %>%
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%  # no weights given, so this is just mean()
   compare_levels(rating, by = oracle) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$overall$metric = "overall"

# Half-eye plot of the per-draw difference in mean rating between oracles.
oracle_differences$overall %>%
      ggplot(aes(x = diff_in_rating, y = "overall")) +
      xlab(paste0("Expected Difference in Rating (",oracle_differences$overall[1,'oracle'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

# Recompute the oracle differences with the shared helper (this replaces the
# hand-rolled version in the previous chunk).
oracle_differences$overall <- user_response_expected_diff_in_mean_plot(overall_predictive_data, "oracle", "overall", "Difference in Average Overall (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$overall$plot

Differences in user score by search and oracle combination (dfs compassql vs bfs dziban only)

overall_predictive_data_subset <- subset(overall_predictive_data, alg %in% c("dfs compassql", "bfs dziban"))
alg_differences$overall <- user_response_expected_diff_in_mean_plot(overall_predictive_data_subset, "alg", "overall", "Difference in Average Overall (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'alg' (override with `.groups` argument)
alg_differences$overall$plot

Differences in user score by participant group

participant_group_differences$overall <- user_response_expected_diff_in_mean_plot(overall_predictive_data, "participant_group", "overall", "Difference in Average Overall (Rating)", "Task", NULL)
## `summarise()` regrouping output by 'participant_group' (override with `.groups` argument)
participant_group_differences$overall$plot

Summary Plots

Putting all of the plots for search algorithm and oracle differences together, split by whether the rating metric is of type confidence or preference. We’ll start with differences in search algorithms.

Differences in Search Algorithms

# Stack the per-metric search-algorithm differences into one data frame
# for the combined summary plots.
combined_search_differences <- rbind(
  search_differences$confidence_udata$differences, 
  search_differences$confidence_ans$differences, 
  search_differences$efficiency$differences,
  search_differences$ease_of_use$differences, 
  search_differences$utility$differences, 
  search_differences$overall$differences)

# Order metrics consistently on the y axis (reversed so the first metric is on top).
combined_search_differences$metric <- factor(combined_search_differences$metric, levels=rev(analyses))
# Confidence metrics only (confidence.udata, confidence.ans).
combined_search_differences_confidence <- subset(combined_search_differences, metric %in% confidence_metrics)
search_differences_plot_confidence <- combined_search_differences_confidence %>%
      ggplot(aes(x = difference, y = metric)) +
      ylab("Confidence") +
      xlab(paste0("Expected Difference in Rating (",combined_search_differences_confidence[1,'search'],")")) +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +  # reference line at "no difference"
      theme_minimal()

search_differences_plot_confidence

View intervals

# Numeric 95%/50% quantile intervals for the differences plotted above.
fit_info_search_differences_confidence <- combined_search_differences_confidence %>% group_by(search, metric) %>% mean_qi(difference, .width = c(.95, .5))
fit_info_search_differences_confidence
## # A tibble: 4 x 8
## # Groups:   search [1]
##   search    metric           difference  .lower .upper .width .point .interval
##   <chr>     <fct>                 <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 bfs - dfs confidence.ans      -0.0625 -0.364  0.235    0.95 mean   qi       
## 2 bfs - dfs confidence.udata     0.0465 -0.273  0.371    0.95 mean   qi       
## 3 bfs - dfs confidence.ans      -0.0625 -0.167  0.0455   0.5  mean   qi       
## 4 bfs - dfs confidence.udata     0.0465 -0.0682 0.159    0.5  mean   qi
# Preference metrics only (efficiency, ease.of.use, utility, overall).
combined_search_differences_preference <- subset(combined_search_differences, metric %in% preference_metrics)
search_differences_plot_preference <- combined_search_differences_preference %>%
      ggplot(aes(x = difference, y = metric)) +
      ylab("Preference") +  # fixed: was mislabeled "Confidence" (copy-paste from the confidence plot)
      xlab(paste0("Expected Difference in Rating (",combined_search_differences_preference[1,'search'],")")) +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +  # reference line at "no difference"
      theme_minimal()
search_differences_plot_preference

View intervals

# Numeric 95%/50% quantile intervals for the differences plotted above.
fit_info_search_differences_preference <- combined_search_differences_preference %>% group_by(search, metric) %>% mean_qi(difference, .width = c(.95, .5))
fit_info_search_differences_preference
## # A tibble: 8 x 8
## # Groups:   search [1]
##   search    metric      difference  .lower .upper .width .point .interval
##   <chr>     <fct>            <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 bfs - dfs overall          0.343 -0.144   0.826   0.95 mean   qi       
## 2 bfs - dfs utility          0.533 -0.0227  1.08    0.95 mean   qi       
## 3 bfs - dfs ease.of.use      0.580  0.152   1.03    0.95 mean   qi       
## 4 bfs - dfs efficiency       0.669  0.0758  1.24    0.95 mean   qi       
## 5 bfs - dfs overall          0.343  0.189   0.508   0.5  mean   qi       
## 6 bfs - dfs utility          0.533  0.348   0.727   0.5  mean   qi       
## 7 bfs - dfs ease.of.use      0.580  0.432   0.720   0.5  mean   qi       
## 8 bfs - dfs efficiency       0.669  0.470   0.871   0.5  mean   qi

Differences in Oracle

# Stack the per-metric oracle differences into one data frame
# for the combined summary plots.
combined_oracle_differences <- rbind(
  oracle_differences$confidence_udata$differences, 
  oracle_differences$confidence_ans$differences, 
  oracle_differences$efficiency$differences,
  oracle_differences$ease_of_use$differences, 
  oracle_differences$utility$differences, 
  oracle_differences$overall$differences)

# Order metrics consistently on the y axis (reversed so the first metric is on top).
combined_oracle_differences$metric <- factor(combined_oracle_differences$metric, levels=rev(analyses))
# Confidence metrics only (confidence.udata, confidence.ans).
combined_oracle_differences_confidence <- subset(combined_oracle_differences, metric %in% confidence_metrics)
oracle_differences_plot_confidence <- combined_oracle_differences_confidence %>%
      ggplot(aes(x = difference, y = metric)) +
      ylab("Confidence") +
      xlab(paste0("Expected Difference in Rating (",combined_oracle_differences_confidence[1,'oracle'],")")) +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +  # reference line at "no difference"
      theme_minimal()

oracle_differences_plot_confidence

View intervals

# Numeric 95%/50% quantile intervals for the differences plotted above.
fit_info_oracle_differences_confidence <- combined_oracle_differences_confidence %>% group_by(oracle, metric) %>% mean_qi(difference, .width = c(.95, .5))
fit_info_oracle_differences_confidence
## # A tibble: 4 x 8
## # Groups:   oracle [1]
##   oracle          metric       difference  .lower .upper .width .point .interval
##   <chr>           <fct>             <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dziban - compa… confidence.…      0.140 -0.144   0.439   0.95 mean   qi       
## 2 dziban - compa… confidence.…      0.203 -0.129   0.530   0.95 mean   qi       
## 3 dziban - compa… confidence.…      0.140  0.0379  0.235   0.5  mean   qi       
## 4 dziban - compa… confidence.…      0.203  0.0909  0.311   0.5  mean   qi
# Preference metrics only (efficiency, ease.of.use, utility, overall).
combined_oracle_differences_preference <- subset(combined_oracle_differences, metric %in% preference_metrics)
oracle_differences_plot_preference <- combined_oracle_differences_preference %>%
      ggplot(aes(x = difference, y = metric)) +
      ylab("Preference") +  # fixed: was mislabeled "Confidence" (copy-paste from the confidence plot)
      xlab(paste0("Expected Difference in Rating (",combined_oracle_differences_preference[1,'oracle'],")")) +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +  # reference line at "no difference"
      theme_minimal()
oracle_differences_plot_preference

View intervals

# Numeric 95%/50% quantile intervals for the differences plotted above.
fit_info_oracle_differences_preference <- combined_oracle_differences_preference %>% group_by(oracle, metric) %>% mean_qi(difference, .width = c(.95, .5))
fit_info_oracle_differences_preference
## # A tibble: 8 x 8
## # Groups:   oracle [1]
##   oracle            metric     difference  .lower .upper .width .point .interval
##   <chr>             <fct>           <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dziban - compass… overall        0.220  -0.265   0.697   0.95 mean   qi       
## 2 dziban - compass… utility        0.312  -0.227   0.841   0.95 mean   qi       
## 3 dziban - compass… ease.of.u…     0.0874 -0.364   0.561   0.95 mean   qi       
## 4 dziban - compass… efficiency     0.261  -0.303   0.826   0.95 mean   qi       
## 5 dziban - compass… overall        0.220   0.0530  0.386   0.5  mean   qi       
## 6 dziban - compass… utility        0.312   0.129   0.492   0.5  mean   qi       
## 7 dziban - compass… ease.of.u…     0.0874 -0.0682  0.242   0.5  mean   qi       
## 8 dziban - compass… efficiency     0.261   0.0682  0.455   0.5  mean   qi

dfs compassql vs bfs dziban

# Stack the per-metric "dfs compassql vs bfs dziban" differences into one
# data frame for the combined summary plots.
combined_alg_differences <- rbind(
  alg_differences$confidence_udata$differences, 
  alg_differences$confidence_ans$differences, 
  alg_differences$efficiency$differences,
  alg_differences$ease_of_use$differences, 
  alg_differences$utility$differences, 
  alg_differences$overall$differences)

# Order metrics consistently on the y axis (reversed so the first metric is on top).
combined_alg_differences$metric <- factor(combined_alg_differences$metric, levels=rev(analyses))
# Confidence metrics only (confidence.udata, confidence.ans).
combined_alg_differences_confidence <- subset(combined_alg_differences, metric %in% confidence_metrics)
alg_differences_plot_confidence <- combined_alg_differences_confidence %>%
      ggplot(aes(x = difference, y = metric)) +
      ylab("Confidence") +
      xlab(paste0("Expected Difference in Rating (",combined_alg_differences_confidence[1,'alg'],")")) +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +  # reference line at "no difference"
      theme_minimal()

alg_differences_plot_confidence

View intervals

# Numeric 95%/50% quantile intervals for the differences plotted above.
fit_info_alg_differences_confidence <- combined_alg_differences_confidence %>% group_by(alg, metric) %>% mean_qi(difference, .width = c(.95, .5))
fit_info_alg_differences_confidence
## # A tibble: 4 x 8
## # Groups:   alg [1]
##   alg              metric      difference .lower  .upper .width .point .interval
##   <chr>            <fct>            <dbl>  <dbl>   <dbl>  <dbl> <chr>  <chr>    
## 1 dfs compassql -… confidence…    -0.0757 -0.5    0.338    0.95 mean   qi       
## 2 dfs compassql -… confidence…    -0.242  -0.721  0.235    0.95 mean   qi       
## 3 dfs compassql -… confidence…    -0.0757 -0.210  0.0588   0.5  mean   qi       
## 4 dfs compassql -… confidence…    -0.242  -0.397 -0.0882   0.5  mean   qi
# Preference metrics only (efficiency, ease.of.use, utility, overall).
combined_alg_differences_preference <- subset(combined_alg_differences, metric %in% preference_metrics)
alg_differences_plot_preference <- combined_alg_differences_preference %>%
      ggplot(aes(x = difference, y = metric)) +
      ylab("Preference") +  # fixed: was mislabeled "Confidence" (copy-paste from the confidence plot)
      xlab(paste0("Expected Difference in Rating (",combined_alg_differences_preference[1,'alg'],")")) +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +  # reference line at "no difference"
      theme_minimal()
alg_differences_plot_preference

View intervals

# Numeric 95%/50% quantile intervals for the differences plotted above.
fit_info_alg_differences_preference <- combined_alg_differences_preference %>% group_by(alg, metric) %>% mean_qi(difference, .width = c(.95, .5))
fit_info_alg_differences_preference
## # A tibble: 8 x 8
## # Groups:   alg [1]
##   alg                metric    difference .lower  .upper .width .point .interval
##   <chr>              <fct>          <dbl>  <dbl>   <dbl>  <dbl> <chr>  <chr>    
## 1 dfs compassql - b… overall       -0.546 -1.22   0.132    0.95 mean   qi       
## 2 dfs compassql - b… utility       -0.820 -1.53  -0.0588   0.95 mean   qi       
## 3 dfs compassql - b… ease.of.…     -0.648 -1.28  -0.0294   0.95 mean   qi       
## 4 dfs compassql - b… efficien…     -0.903 -1.68  -0.0732   0.95 mean   qi       
## 5 dfs compassql - b… overall       -0.546 -0.779 -0.309    0.5  mean   qi       
## 6 dfs compassql - b… utility       -0.820 -1.07  -0.574    0.5  mean   qi       
## 7 dfs compassql - b… ease.of.…     -0.648 -0.868 -0.426    0.5  mean   qi       
## 8 dfs compassql - b… efficien…     -0.903 -1.19  -0.632    0.5  mean   qi

Differences in Participant Group

# Stack the per-metric participant-group differences into one data frame
# for the combined summary plots.
combined_participant_group_differences <- rbind(
  participant_group_differences$confidence_udata$differences, 
  participant_group_differences$confidence_ans$differences, 
  participant_group_differences$efficiency$differences,
  participant_group_differences$ease_of_use$differences, 
  participant_group_differences$utility$differences, 
  participant_group_differences$overall$differences)

# Order metrics consistently on the y axis (reversed so the first metric is on top).
combined_participant_group_differences$metric <- factor(combined_participant_group_differences$metric, levels=rev(analyses))
# Confidence metrics only (confidence.udata, confidence.ans).
combined_participant_group_differences_confidence <- subset(combined_participant_group_differences, metric %in% confidence_metrics)
participant_group_differences_plot_confidence <- combined_participant_group_differences_confidence %>%
      ggplot(aes(x = difference, y = metric)) +
      ylab("Confidence") +
      xlab(paste0("Expected Difference in Rating (",combined_participant_group_differences_confidence[1,'participant_group'],")")) +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +  # reference line at "no difference"
      theme_minimal()

participant_group_differences_plot_confidence

View intervals

# Numeric 95%/50% quantile intervals for the differences plotted above.
fit_info_participant_group_differences_confidence <- combined_participant_group_differences_confidence %>% group_by(participant_group, metric) %>% mean_qi(difference, .width = c(.95, .5))
fit_info_participant_group_differences_confidence
## # A tibble: 4 x 8
## # Groups:   participant_group [1]
##   participant_group  metric    difference  .lower .upper .width .point .interval
##   <chr>              <fct>          <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 student - profess… confiden…    0.00390 -0.163  0.172    0.95 mean   qi       
## 2 student - profess… confiden…    0.00481 -0.139  0.150    0.95 mean   qi       
## 3 student - profess… confiden…    0.00390 -0.0538 0.0611   0.5  mean   qi       
## 4 student - profess… confiden…    0.00481 -0.0447 0.0534   0.5  mean   qi
# Preference metrics only (efficiency, ease.of.use, utility, overall).
combined_participant_group_differences_preference <- subset(combined_participant_group_differences, metric %in% preference_metrics)
participant_group_differences_plot_preference <- combined_participant_group_differences_preference %>%
      ggplot(aes(x = difference, y = metric)) +
      ylab("Preference") +  # fixed: was mislabeled "Confidence" (copy-paste from the confidence plot)
      xlab(paste0("Expected Difference in Rating (",combined_participant_group_differences_preference[1,'participant_group'],")")) +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +  # reference line at "no difference"
      theme_minimal()
participant_group_differences_plot_preference

View intervals

# Numeric 95%/50% quantile intervals for the differences plotted above.
fit_info_participant_group_differences_preference <- combined_participant_group_differences_preference %>% group_by(participant_group, metric) %>% mean_qi(difference, .width = c(.95, .5))
fit_info_participant_group_differences_preference
## # A tibble: 8 x 8
## # Groups:   participant_group [1]
##   participant_group   metric   difference  .lower .upper .width .point .interval
##   <chr>               <fct>         <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 student - professi… overall    0.00821  -0.155  0.182    0.95 mean   qi       
## 2 student - professi… utility   -0.00242  -0.242  0.238    0.95 mean   qi       
## 3 student - professi… ease.of…   0.000685 -0.189  0.198    0.95 mean   qi       
## 4 student - professi… efficie…   0.00465  -0.213  0.237    0.95 mean   qi       
## 5 student - professi… overall    0.00821  -0.0505 0.0649   0.5  mean   qi       
## 6 student - professi… utility   -0.00242  -0.0841 0.0780   0.5  mean   qi       
## 7 student - professi… ease.of…   0.000685 -0.0659 0.0665   0.5  mean   qi       
## 8 student - professi… efficie…   0.00465  -0.0702 0.0798   0.5  mean   qi